# This gives a first view of the matching errors. 
# It's based on regional matches. 
# The matching error is slightly higher because, when we aggregate the regional
# files, we drop people with multiple matches. This is the case when a worker
# has a job in two different working regions and gets two different ids.


library("haven")
library("data.table")
setwd("C:/Users/Public/Documents/pseudo_id/")



# Collect the match files: every file in "ctrl" whose name matches the
# regular expression "match_all".
files <- list.files(path = "ctrl", pattern = "match_all")
if (length(files) == 0L) {
  # Fail early with a clear message instead of trying to read a missing file.
  stop("No 'match_all' files found in ", file.path(getwd(), "ctrl"),
       call. = FALSE)
}
# The files are read by bare name below, so switch into their directory.
setwd("ctrl/")

# Extract a four-digit year (19xx or 20xx) from a file name; NA if there is
# none.
# NOTE(review): the script never defined `year`; with data.table attached the
# bare name resolves to the function data.table::year, so the old assignment
# could not work in a fresh session. Deriving the year from the file name is
# an assumption -- confirm against the naming scheme of the match_all files.
year_from_file_name <- function(file_name) {
  hit <- regmatches(file_name, regexpr("(19|20)[0-9]{2}", file_name))
  if (length(hit) == 0L) NA_integer_ else as.integer(hit)
}

# Read one SAS file, lower-case its column names and tag it with its year.
# Names are normalised first so that an existing "YEAR" column is overwritten
# rather than duplicated.
read_match_file <- function(file_i) {
  dd <- read_sas(file_i)
  colnames(dd) <- tolower(colnames(dd))
  dd$year <- year_from_file_name(file_i)
  dd
}

end <- length(files)
# end <- 1
# Read the first `end` files and stack them in a single rbind() call instead
# of growing the result inside a loop (which re-copies all rows each time).
ddd <- do.call(rbind, lapply(files[seq_len(end)], read_match_file))


# Quick look at the structure of the stacked input.
str(ddd)
ddd <- as.data.table(ddd)

# Total `count` for every (match, nbmatch_yt_1) combination, then spread the
# nbmatch_yt_1 categories into columns (one row per value of `match`).
# The aggregate is named and passed explicitly as value.var so that dcast()
# does not have to guess which column holds the values.
totals <- ddd[, list(total = sum(count)), by = list(match, nbmatch_yt_1)]
ttt <- dcast(totals, match ~ nbmatch_yt_1, value.var = "total")

# The failure rate needs both outcome columns; stop with a clear message if a
# category never occurs in the data.
needed <- c("not matched", "unique match")
absent <- setdiff(needed, names(ttt))
if (length(absent) > 0L) {
  stop("Expected nbmatch_yt_1 categories missing from the data: ",
       paste(absent, collapse = ", "), call. = FALSE)
}

# Share of failed matches among "not matched" + "unique match" counts.
ttt$pct_failed <- ttt[["not matched"]] /
  (ttt[["not matched"]] + ttt[["unique match"]])

print(ttt)

